library(haven)
library(quanteda)
library(data.table)
library(stringr)
library(dplyr)
library(tidyr)

# Clear the global workspace, then resolve all file names below relative to
# the replication folder.
# NOTE(review): the leading "..." in the path looks like a placeholder that
# has to be replaced with the local location of the kit -- confirm.
rm(list = ls())
setwd(".../replication kit/intermediate files and code")

# Load only the `id` and `text` columns of the micro data and hold them as a
# data.table.
data <- as.data.table(
    read_dta("fb tw micro data.dta", col_select = c(id, text))
)



###########################################################
# negative and positive words based on SentiWS dictionary #
###########################################################


# Build a quanteda corpus from the raw text column (one document per element).
doc.corpus <- corpus(data[["text"]])

# Read a sentiment word list and reshape it to one row per word form.
#
# The file is read as a header-less, tab-delimited table with columns V1, V2
# and V3. NOTE(review): presumably this is the SentiWS layout
# "lemma|POS <TAB> score <TAB> comma-separated inflections" -- confirm against
# the dictionary files.
#
# Args:
#   filename: path of the word-list file to read.
#
# Returns:
#   A data frame with the columns `word` and `score` (the V2 value of the
#   source row), with empty words removed and rows sorted by `word`.
#   Because the last expression of the body is an assignment, the value is
#   returned invisibly; callers must assign or pipe it.
read_senti_scores <- function(filename) {
    
    results <- read.delim(filename, header = FALSE, encoding="UTF-8") %>%
        # Split V3 on "," or "-" into exactly 50 pieces (padded with "") and
        # append them as extra character columns; the `.` inside the nested
        # call refers to the data frame coming down the pipe.
        cbind(str_split_fixed(.$V3, "[,-]",50),stringsAsFactors = FALSE) %>%
        mutate(
            # Keep only the part of V1 in front of the last "|": the greedy
            # match ends at the "|", and str_sub(1, -2) drops that "|".
            V1 = str_sub(str_match(V1,".*\\|"),1,-2),
            # Running row index; it is excluded from the reshape below and
            # dropped by the later select().
            nr = row_number()
        ) %>%
        select(-V3) %>%
        mutate(nr = as.character(nr)) %>%
        # Stack the base form (V1) and the split-out forms into one `word`
        # column, keeping V2 alongside each of them.
        # NOTE(review): `1:48` appears to be interpreted as column positions,
        # not names. With the column order at this point (V1, V2, the 50 split
        # columns, nr) that would cover only the first 46 split columns, so
        # forms in the last four would never reach `word` -- TODO confirm
        # that no entry has that many forms.
        gather(wordstem,word,V1,1:48, -nr,-V2) %>%
        select(word,V2) %>% rename(score=V2) %>%
        # Drops the "" padding produced by the fixed-width split (rows where
        # `word` is NA are removed by filter() as well).
        filter(word != "") %>%
        arrange(word)
    
}

# Read both SentiWS word lists and tag every entry with its polarity; the
# `sentiment` tag becomes the dictionary key (and later the count column name).
positive <- read_senti_scores("SentiWS_v2.0_Positive.txt") %>%
    mutate(sentiment = "positive")

negative <- read_senti_scores("SentiWS_v2.0_Negative.txt") %>%
    mutate(sentiment = "negative")

sentis <- bind_rows(positive, negative)

# as.dictionary() builds the quanteda dictionary from the `word` and
# `sentiment` columns.
data_dictionary_sentiws <- as.dictionary(sentis)

# Count dictionary hits per document. Calling dfm() directly on a corpus with
# `dictionary =` / `stem =` is deprecated in quanteda 3 and no longer
# available in quanteda 4, so the lookup is done explicitly on the tokens.
# tokens_lookup() matches case-insensitively by default and keeps only the
# dictionary keys, so the resulting dfm has one column per key.
sentiment <- tokens(doc.corpus) %>%
    tokens_lookup(dictionary = data_dictionary_sentiws) %>%
    dfm()
sentiment_df <- convert(sentiment, "data.frame")

# Drop the document-id column so only the key counts remain. Its name differs
# between quanteda releases (`document` in older ones, `doc_id` in newer
# ones); removing only `document` would leave a stray id column in the export.
sentiment_df <- sentiment_df[
    , !(names(sentiment_df) %in% c("document", "doc_id")),
    drop = FALSE
]



##############################
# other text characteristics #
##############################

# Post/tweet starts with a German question word.
# Each indicator is a factor flagging whether the lower-cased text begins with
# the given letters. The levels are fixed to FALSE/TRUE so that every
# indicator is coded the same way even if one value never occurs in the data.
# NOTE(review): these are prefix matches without a word boundary, so "^wo"
# also fires on e.g. "wohnung" and "^wer" on "werden". The patterns are left
# unchanged so the exported values stay comparable -- confirm this is intended.
text_lower <- tolower(data$text)
flag_levels <- c(FALSE, TRUE)

start_w_was <- factor(grepl("^was", text_lower), levels = flag_levels)
start_w_wer <- factor(grepl("^wer", text_lower), levels = flag_levels)
start_w_wo <- factor(grepl("^wo", text_lower), levels = flag_levels)
start_w_wann <- factor(grepl("^wann", text_lower), levels = flag_levels)
start_w_warum <- factor(grepl("^warum", text_lower), levels = flag_levels)
start_w_wie <- factor(grepl("^wie", text_lower), levels = flag_levels)

# Counts.
# Number of whitespace-separated chunks per text. lengths() always returns an
# integer vector (sapply() would return an empty list for zero rows). An empty
# text yields 0, and leading whitespace yields an empty first chunk that is
# counted as well.
number_of_words <- lengths(strsplit(text_lower, "\\s+"))

# Total characters of the text (whitespace and punctuation included) divided
# by the chunk count; a text with 0 chunks gives NaN/Inf.
mean_word_length <- nchar(data$text) / number_of_words

# Number of literal "?" and "!" characters in each text.
questionmarks <- str_count(data$text, "\\?")
exclamationmarks <- str_count(data$text, "\\!")


######################
# combine and export #
######################

# Put the identifiers, the dictionary counts and all text features side by
# side, one row per post/tweet.
combined <- cbind(
    data,
    sentiment_df,
    start_w_was, start_w_wer, start_w_wo,
    start_w_wann, start_w_warum, start_w_wie,
    number_of_words, mean_word_length,
    questionmarks, exclamationmarks
)

# Express the sentiment hits as a share of the word count.
combined[["share_neg_words"]] <-
    combined[["negative"]] / combined[["number_of_words"]]
combined[["share_pos_words"]] <-
    combined[["positive"]] / combined[["number_of_words"]]

# The raw hit counts and the text itself are not exported.
combined[["negative"]] <- NULL
combined[["positive"]] <- NULL
combined[["text"]] <- NULL

# Write the result as a Stata 14 file into the working directory.
write_dta(data = combined, path = "text_stats.dta", version = 14)




















